#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
// OpenCL C source text for four buffer-based pooling kernels, kept as one
// concatenated string literal:
//   pooling_c4_c4, pooling_c4_c16, pooling_c16_c16, pooling_c16_c4
// The suffixes presumably name the input/output channel packing (4 or 16
// channels per block) -- the indexing below multiplies offsets by 4 or 16
// accordingly; confirm against the host-side launcher.
//
// Macros the kernel text expects to be supplied at build time (they are used
// but not defined in this string): STRIDE_X, STRIDE_Y, KERNEL_X, KERNEL_Y,
// INPUT_LINE_SIZE, FLOAT, FLOAT4, COMPUTE_FLOAT, COMPUTE_FLOAT4, COMPUTE_FLOAT8,
// CONVERT_FLOAT4, CONVERT_FLOAT8, CONVERT_COMPUTE_FLOAT4.
// Optional switches tested by the text: MNN_SUPPORT_FP16, POOL_AVG (average
// instead of max), COUNT_INCLUDE_PADDING, RETURN_REDICE (also write the flat
// input index of the max element), OUTPUT_LEFTOVERS.
//
// NOTE: everything between the quotes is program text handed to the OpenCL
// compiler; the `//` lines in this definition are C++ comments and are not
// part of the string.
const char* pooling_subgroup_buf =
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
// ---------------------------------------------------------------------------
// pooling_c4_c4: one work-item per (output column, batch*output row, channel
// block). Out-of-range work-items return early via DEAL_NON_UNIFORM_DIM3.
// Average mode counts only in-bounds taps unless COUNT_INCLUDE_PADDING is set.
// ---------------------------------------------------------------------------
"__kernel void pooling_c4_c4(GLOBAL_SIZE_3_DIMS __global const FLOAT *input,\n"
" __private const int2 input_shape,\n"
" __private const int2 output_shape,\n"
" __private const int2 pad_shape,\n"
" __global FLOAT *output,\n"
" __global FLOAT *rediceOutput,\n"
" __private const int channel,\n"
" __private const int batch,\n"
" __private const int in_channel_block,\n"
" __private const int out_channel_block,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right) {\n"
" \n"
" const int ow_idx=get_global_id(0);\n"
" const int b_oh_idx=get_global_id(1);\n"
" const int c_idx=get_global_id(2);\n"
" DEAL_NON_UNIFORM_DIM3(ow_idx,b_oh_idx,c_idx);\n"
" \n"
" const int b_idx=b_oh_idx/output_shape.x;\n"
" const int oh_idx=b_oh_idx % output_shape.x;\n"
" const int iw_start=mad24(ow_idx,STRIDE_X,-pad_shape.y);\n"
" const int ih_start=mad24(oh_idx,STRIDE_Y,-pad_shape.x);\n"
" \n"
" #ifdef POOL_AVG\n"
" COMPUTE_FLOAT4 result=(COMPUTE_FLOAT4)(0);\n"
" const int inp_offset=(((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4;\n"
"#ifdef COUNT_INCLUDE_PADDING\n"
" int total_count=(min(ih_start+KERNEL_Y,input_shape.x+pad_shape.x)-ih_start)*(min(iw_start+KERNEL_X,input_shape.y+pad_shape.y)-iw_start);\n"
"#else\n"
" int total_count=0;\n"
"#endif\n"
" for(int kh=0; kh<KERNEL_Y; kh++) {\n"
" int ih_cur=ih_start+kh;\n"
" if(ih_cur<0 || ih_cur >= input_shape.x) {\n"
" continue;\n"
" }\n"
" for(int kw=0; kw<KERNEL_X; kw++) {\n"
" int iw_cur=iw_start+kw;\n"
" if(iw_cur<0 || iw_cur >= input_shape.y) {\n"
" continue;\n"
" }\n"
" COMPUTE_FLOAT4 inp_data=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset+(kh*input_shape.y+kw)*4));\n"
" result += inp_data;\n"
"#ifndef COUNT_INCLUDE_PADDING\n"
" total_count++;\n"
"#endif\n"
" }\n"
" }\n"
" result=result/(COMPUTE_FLOAT4)(1.0*total_count);\n"
" #else\n"
" COMPUTE_FLOAT4 result=(COMPUTE_FLOAT4)(-FLT_MAX);\n"
" #if RETURN_REDICE\n"
" int4 redice=(int4)0;\n"
" #endif\n"
" const int inp_offset=(((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4;\n"
" for(int kh=0; kh<KERNEL_Y; kh++) {\n"
" int ih_cur=ih_start+kh;\n"
" if(ih_cur<0 || ih_cur >= input_shape.x) {\n"
" continue;\n"
" }\n"
" for(int kw=0; kw<KERNEL_X; kw++) {\n"
" int iw_cur=iw_start+kw;\n"
" if(iw_cur<0 || iw_cur >= input_shape.y) {\n"
" continue;\n"
" }\n"
" COMPUTE_FLOAT4 inp_data=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset+(kh*input_shape.y+kw)*4));\n"
" #if RETURN_REDICE\n"
" redice=inp_data>result ? (int4)((ih_start+kh)*input_shape.y+iw_start+kw) : redice;\n"
" #endif\n"
" result=fmax(result,inp_data);\n"
" }\n"
" }\n"
" #endif\n"
" \n"
" const int out_offset=(((b_idx+c_idx*batch)*output_shape.x+oh_idx)* output_shape.y+ow_idx+output_pad_left)*4;\n"
" vstore4(CONVERT_FLOAT4(result),0,output+out_offset);\n"
" #if RETURN_REDICE\n"
" vstore4(CONVERT_FLOAT4(redice),0,rediceOutput+(((b_idx+c_idx*batch)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4);\n"
" #endif\n"
"}\n"
// ---------------------------------------------------------------------------
// pooling_c4_c16: same per-work-item pooling loop as pooling_c4_c4, but the
// result is stored at a 16-wide stride (c_idx/4 selects the block, c_idx%4 the
// 4-channel slot inside it). The work-item with ow_idx == 0 also zero-fills
// the output_pad_left / output_pad_right columns of its row.
// ---------------------------------------------------------------------------
"__kernel void pooling_c4_c16(GLOBAL_SIZE_3_DIMS __global const FLOAT *input,\n"
" __private const int2 input_shape,\n"
" __private const int2 output_shape,\n"
" __private const int2 pad_shape,\n"
" __global FLOAT *output,\n"
" __global FLOAT *rediceOutput,\n"
" __private const int channel,\n"
" __private const int batch,\n"
" __private const int in_channel_block,\n"
" __private const int out_channel_block,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right) {\n"
" \n"
" const int ow_idx=get_global_id(0);\n"
" const int b_oh_idx=get_global_id(1);\n"
" const int c_idx=get_global_id(2);\n"
" DEAL_NON_UNIFORM_DIM3(ow_idx,b_oh_idx,c_idx);\n"
" \n"
" const int b_idx=b_oh_idx/output_shape.x;\n"
" const int oh_idx=b_oh_idx % output_shape.x;\n"
" const int iw_start=mad24(ow_idx,STRIDE_X,-pad_shape.y);\n"
" const int ih_start=mad24(oh_idx,STRIDE_Y,-pad_shape.x);\n"
" const int dst_width=output_shape.y+output_pad_left+output_pad_right;\n"
" \n"
" #ifdef POOL_AVG\n"
" COMPUTE_FLOAT4 result=(COMPUTE_FLOAT4)(0);\n"
" const int inp_offset=(((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4;\n"
" #ifdef COUNT_INCLUDE_PADDING\n"
" int total_count=(min(ih_start+KERNEL_Y,input_shape.x+pad_shape.x)-ih_start)*(min(iw_start+KERNEL_X,input_shape.y+pad_shape.y)-iw_start);\n"
"#else\n"
" int total_count=0;\n"
"#endif\n"
" for(int kh=0; kh<KERNEL_Y; kh++) {\n"
" int ih_cur=ih_start+kh;\n"
" if(ih_cur<0 || ih_cur >= input_shape.x) {\n"
" continue;\n"
" }\n"
" for(int kw=0; kw<KERNEL_X; kw++) {\n"
" int iw_cur=iw_start+kw;\n"
" if(iw_cur<0 || iw_cur >= input_shape.y) {\n"
" continue;\n"
" }\n"
" COMPUTE_FLOAT4 inp_data=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset+(kh*input_shape.y+kw)*4));\n"
" result += inp_data;\n"
"#ifndef COUNT_INCLUDE_PADDING\n"
" total_count++;\n"
"#endif\n"
" }\n"
" }\n"
" result=result/(COMPUTE_FLOAT4)(1.0*total_count);\n"
" #else\n"
" COMPUTE_FLOAT4 result=(COMPUTE_FLOAT4)(-FLT_MAX);\n"
" #if RETURN_REDICE\n"
" int4 redice=(int4)0;\n"
" #endif\n"
" const int inp_offset=(((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4;\n"
" for(int kh=0; kh<KERNEL_Y; kh++) {\n"
" int ih_cur=ih_start+kh;\n"
" if(ih_cur<0 || ih_cur >= input_shape.x) {\n"
" continue;\n"
" }\n"
" for(int kw=0; kw<KERNEL_X; kw++) {\n"
" int iw_cur=iw_start+kw;\n"
" if(iw_cur<0 || iw_cur >= input_shape.y) {\n"
" continue;\n"
" }\n"
" COMPUTE_FLOAT4 inp_data=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset+(kh*input_shape.y+kw)*4));\n"
" #if RETURN_REDICE\n"
" redice=inp_data>result ? (int4)((ih_start+kh)*input_shape.y+iw_start+kw) : redice;\n"
" #endif\n"
" result=fmax(result,inp_data);\n"
" }\n"
" }\n"
" #endif\n"
" const int c_left=(c_idx % 4)*4;\n"
" const int out_offset=(((b_idx*out_channel_block+c_idx/4)*output_shape.x+oh_idx)* dst_width+ow_idx+output_pad_left)*16+c_left;\n"
" vstore4(CONVERT_FLOAT4(result),0,output+out_offset);\n"
// NOTE(review): this index-output offset uses (b_idx*out_channel_block+c_idx)
// while pooling_c4_c4 uses (b_idx+c_idx*batch); left as-is because the layout
// the host expects is not visible here -- confirm against the caller.
" #if RETURN_REDICE\n"
" vstore4(CONVERT_FLOAT4(redice),0,rediceOutput+(((b_idx*out_channel_block+c_idx)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4);\n"
" #endif\n"
" if(ow_idx == 0){\n"
" int pad_offset=(((b_idx*out_channel_block+c_idx/4)*output_shape.x+oh_idx)* dst_width+0)*16+c_left;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" vstore4((FLOAT4)0,0,output+pad_offset+i*16);\n"
" }\n"
" pad_offset += (output_shape.y+output_pad_left)*16;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" vstore4((FLOAT4)0,0,output+pad_offset+i*16);\n"
" }\n"
" }\n"
"}\n"
// ---------------------------------------------------------------------------
// pooling_c16_c16: requires an Intel sub-group size of 16. Each work-item
// produces 8 consecutive output columns (ow_idx = get_global_id(1) << 3) and
// the sub-group lane id (sglid) addresses one of the 16 channels in a block.
// Each valid input row is first loaded into line_cache with sub-group block
// reads; out-of-range columns are filled with 0 (average) or -FLT_MAX (max).
// ---------------------------------------------------------------------------
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void pooling_c16_c16(GLOBAL_SIZE_3_DIMS __global const FLOAT *input,\n"
" __private const int2 input_shape,\n"
" __private const int2 output_shape,\n"
" __private const int2 pad_shape,\n"
" __global FLOAT *output,\n"
" __global FLOAT *rediceOutput,\n"
" __private const int channel,\n"
" __private const int batch,\n"
" __private const int in_channel_block,\n"
" __private const int out_channel_block,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right) {\n"
" \n"
" const int ow_idx=get_global_id(1) << 3;\n"
" const int b_oh_idx=get_global_id(2);\n"
" const int c_idx=get_group_id(0);\n"
" const int sglid=get_sub_group_local_id();\n"
" \n"
" const int b_idx=b_oh_idx/output_shape.x;\n"
" const int oh_idx=b_oh_idx % output_shape.x;\n"
" const int iw_start=mad24(ow_idx,STRIDE_X,-pad_shape.y);\n"
" const int ih_start=mad24(oh_idx,STRIDE_Y,-pad_shape.x);\n"
" const int src_width=input_shape.y+input_pad_left+input_pad_right;\n"
" const int dst_width=output_shape.y+output_pad_left+output_pad_right;\n"
"#ifdef POOL_AVG\n"
" COMPUTE_FLOAT8 result=(COMPUTE_FLOAT8)(0);\n"
" COMPUTE_FLOAT8 w_start=(COMPUTE_FLOAT8)(iw_start,iw_start+STRIDE_X,iw_start+STRIDE_X*2,iw_start+STRIDE_X*3,iw_start+STRIDE_X*4,iw_start+STRIDE_X*5,iw_start+STRIDE_X*6,iw_start+STRIDE_X*7);\n"
"#ifdef COUNT_INCLUDE_PADDING\n"
" COMPUTE_FLOAT8 w_size=fmin(w_start+KERNEL_X,input_shape.y+pad_shape.y)-w_start;\n"
" COMPUTE_FLOAT8 total_count=(COMPUTE_FLOAT8)(min(ih_start+KERNEL_Y,input_shape.x+pad_shape.x)-ih_start)*w_size;\n"
"#else\n"
// The window end must be derived from the UNclamped start; clamping first
// would over-count taps on the left/top border (e.g. start=-1, K=3 -> 3
// instead of the 2 in-bounds taps that are actually summed).
" COMPUTE_FLOAT8 w_end=fmin(w_start+KERNEL_X,(COMPUTE_FLOAT8)input_shape.y);\n"
" w_start=fmax(w_start,(COMPUTE_FLOAT8)0);\n"
" float h_end=fmin((float)(ih_start+KERNEL_Y),(float)input_shape.x);\n"
" float h_start=fmax((float)ih_start,0);\n"
" COMPUTE_FLOAT8 total_count=(w_end-w_start)*(COMPUTE_FLOAT8)(h_end-h_start);\n"
"#endif\n"
"#else\n"
" COMPUTE_FLOAT8 result=(COMPUTE_FLOAT8)(-FLT_MAX);\n"
"#if RETURN_REDICE\n"
" int8 redice=(int8)0;\n"
"#endif\n"
"#endif\n"
" const int inp_offset=mul24(mad24(mad24(mad24(b_idx,in_channel_block,c_idx),input_shape.x,ih_start),src_width,iw_start+input_pad_left),16);\n"
" for(int kh=0; kh<KERNEL_Y; kh++) {\n"
" int ih_cur=ih_start+kh;\n"
" if(ih_cur<0 || ih_cur >= input_shape.x) {\n"
" continue;\n"
" }\n"
" FLOAT line_cache[INPUT_LINE_SIZE];\n"
" for (int i=0; i<INPUT_LINE_SIZE; i++) {\n"
" if ((iw_start+i) >= 0 && (iw_start+i)<input_shape.y){\n"
"#ifdef MNN_SUPPORT_FP16\n"
" line_cache[i]=as_half(intel_sub_group_block_read_us((__global ushort*)(input+inp_offset+mul24(mad24(kh,src_width,i),16))));\n"
"#else\n"
" line_cache[i]=as_float(intel_sub_group_block_read((__global uint*)(input+inp_offset+mul24(mad24(kh,src_width,i),16))));\n"
"#endif\n"
" } else{\n"
"#ifdef POOL_AVG\n"
" line_cache[i]=0;\n"
"#else\n"
" line_cache[i]=(COMPUTE_FLOAT)(-FLT_MAX);\n"
"#endif\n"
" }\n"
" }\n"
" for(int kw=0; kw<KERNEL_X; kw++) {\n"
" COMPUTE_FLOAT8 src;\n"
" __attribute__((opencl_unroll_hint(8)))\n"
" for (int i=0; i<8; i++) {\n"
" src[i]=line_cache[kw+STRIDE_X*i];\n"
" }\n"
"#ifdef POOL_AVG\n"
" result += src;\n"
"#else\n"
"#if RETURN_REDICE\n"
" redice=src>result ? (int8)((ih_start+kh)*input_shape.y+iw_start+kw) : redice;\n"
"#endif\n"
" result=fmax(result,src);\n"
"#endif\n"
" }\n"
" }\n"
"#ifdef POOL_AVG\n"
" result=result/total_count;\n"
"#endif\n"
" if(ow_idx == 0){\n"
" int pad_offset=(((b_idx*out_channel_block+c_idx)*output_shape.x+oh_idx)* dst_width+0)*16+sglid;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" output[pad_offset+i*16]=0;\n"
" }\n"
" pad_offset += (output_shape.y+output_pad_left)*16;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" output[pad_offset+i*16]=0;\n"
" }\n"
" }\n"
" \n"
" const int out_offset=(((b_idx*out_channel_block+c_idx)*output_shape.x+oh_idx)* dst_width+ow_idx+output_pad_left)*16;\n"
"#if OUTPUT_LEFTOVERS\n"
" if ((c_idx+1)*16 >= channel) {\n"
" for (int i=0; i<8; i++) {\n"
" if ((c_idx*16+sglid<channel) && (ow_idx+i)<output_shape.y)\n"
" output[out_offset+i*16+sglid]=result[i];\n"
" }\n"
" }\n"
" else\n"
"#endif \n"
" {\n"
" if (ow_idx+8 <= output_shape.y) {\n"
"#ifdef MNN_SUPPORT_FP16\n"
" intel_sub_group_block_write_us8((__global ushort*)(output+out_offset),as_ushort8(CONVERT_FLOAT8(result)));\n"
"#else\n"
" intel_sub_group_block_write8((__global uint*)(output+out_offset),as_uint8(CONVERT_FLOAT8(result)));\n"
"#endif\n"
" }else{\n"
" for (int i=0; i<output_shape.y % 8; i++) {\n"
" output[out_offset+i*16+sglid]=result[i];\n"
" }\n"
" }\n"
" }\n"
// Uses `#if` (not `#ifdef`) so this section is enabled under exactly the same
// condition as the declaration of `redice` above.
"#if RETURN_REDICE\n"
" const uint lid_x=sglid % 4;\n"
" const uint lid_y=sglid/4;\n"
" \n"
" const int width_height=output_shape.y*output_shape.x*4;\n"
" const int redice_offset=(((b_idx*out_channel_block+c_idx*4)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4;\n"
"#if OUTPUT_LEFTOVERS\n"
" if ((c_idx+1)*16 >= channel) {\n"
" for (int i=0; i<8; i++) {\n"
" if ((c_idx*16+lid_y*4+lid_x<channel) && (ow_idx+i)<output_shape.y)\n"
" rediceOutput[redice_offset+lid_y*width_height+i*4+lid_x]=redice[i];\n"
" }\n"
" }\n"
" else\n"
"#endif\n"
" {\n"
" for (int i=0; i<8 && (ow_idx+i)<output_shape.y; i++) {\n"
" rediceOutput[redice_offset+lid_y*width_height+i*4+lid_x]=redice[i];\n"
" }\n"
" }\n"
"#endif\n"
"}\n"
// ---------------------------------------------------------------------------
// pooling_c16_c4: same sub-group pooling loop as pooling_c16_c16, but each
// lane scatters its channel into a 4-wide layout: lid_y = sglid/4 selects one
// of four 4-channel planes (stride batch_width_height) and lid_x = sglid%4 the
// channel inside it.
// ---------------------------------------------------------------------------
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void pooling_c16_c4(GLOBAL_SIZE_3_DIMS __global const FLOAT *input,\n"
" __private const int2 input_shape,\n"
" __private const int2 output_shape,\n"
" __private const int2 pad_shape,\n"
" __global FLOAT *output,\n"
" __global FLOAT *rediceOutput,\n"
" __private const int channel,\n"
" __private const int batch,\n"
" __private const int in_channel_block,\n"
" __private const int out_channel_block,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right) {\n"
" \n"
" const int ow_idx=get_global_id(1) << 3;\n"
" const int b_oh_idx=get_global_id(2);\n"
" const int c_idx=get_group_id(0);\n"
" const int sglid=get_sub_group_local_id();\n"
" \n"
" const int b_idx=b_oh_idx/output_shape.x;\n"
" const int oh_idx=b_oh_idx % output_shape.x;\n"
" const int iw_start=mad24(ow_idx,STRIDE_X,-pad_shape.y);\n"
" const int ih_start=mad24(oh_idx,STRIDE_Y,-pad_shape.x);\n"
" const int src_width=input_shape.y+input_pad_left+input_pad_right;\n"
"#ifdef POOL_AVG\n"
" COMPUTE_FLOAT8 result=(COMPUTE_FLOAT8)(0);\n"
" COMPUTE_FLOAT8 w_start=(COMPUTE_FLOAT8)(iw_start,iw_start+STRIDE_X,iw_start+STRIDE_X*2,iw_start+STRIDE_X*3,iw_start+STRIDE_X*4,iw_start+STRIDE_X*5,iw_start+STRIDE_X*6,iw_start+STRIDE_X*7);\n"
"#ifdef COUNT_INCLUDE_PADDING\n"
" COMPUTE_FLOAT8 w_size=fmin(w_start+KERNEL_X,input_shape.y+pad_shape.y)-w_start;\n"
" COMPUTE_FLOAT8 total_count=(COMPUTE_FLOAT8)(min(ih_start+KERNEL_Y,input_shape.x+pad_shape.x)-ih_start)*w_size;\n"
"#else\n"
// Window end is taken from the unclamped start so the divisor equals the
// number of in-bounds taps that the loops below actually accumulate.
" COMPUTE_FLOAT8 w_end=fmin(w_start+KERNEL_X,(COMPUTE_FLOAT8)input_shape.y);\n"
" w_start=fmax(w_start,(COMPUTE_FLOAT8)0);\n"
" float h_end=fmin((float)(ih_start+KERNEL_Y),(float)input_shape.x);\n"
" float h_start=fmax((float)ih_start,0);\n"
" COMPUTE_FLOAT8 total_count=(w_end-w_start)*(COMPUTE_FLOAT8)(h_end-h_start);\n"
"#endif\n"
"#else\n"
" COMPUTE_FLOAT8 result=(COMPUTE_FLOAT8)(-FLT_MAX);\n"
"#if RETURN_REDICE\n"
" int8 redice=(int8)0;\n"
"#endif\n"
"#endif\n"
" const int inp_offset=mul24(mad24(mad24(mad24(b_idx,in_channel_block,c_idx),input_shape.x,ih_start),src_width,iw_start+input_pad_left),16);\n"
" for(int kh=0; kh<KERNEL_Y; kh++) {\n"
" int ih_cur=ih_start+kh;\n"
" if(ih_cur<0 || ih_cur >= input_shape.x) {\n"
" continue;\n"
" }\n"
" FLOAT line_cache[INPUT_LINE_SIZE];\n"
" for (int i=0; i<INPUT_LINE_SIZE; i++) {\n"
" if ((iw_start+i) >= 0 && (iw_start+i)<input_shape.y){\n"
"#ifdef MNN_SUPPORT_FP16\n"
" line_cache[i]=as_half(intel_sub_group_block_read_us((__global ushort*)(input+inp_offset+mul24(mad24(kh,src_width,i),16))));\n"
"#else\n"
" line_cache[i]=as_float(intel_sub_group_block_read((__global uint*)(input+inp_offset+mul24(mad24(kh,src_width,i),16))));\n"
"#endif\n"
" } else{\n"
"#ifdef POOL_AVG\n"
" line_cache[i]=0;\n"
"#else\n"
" line_cache[i]=(FLOAT)(-FLT_MAX);\n"
"#endif\n"
" }\n"
" }\n"
" for(int kw=0; kw<KERNEL_X; kw++) {\n"
" COMPUTE_FLOAT8 src;\n"
" __attribute__((opencl_unroll_hint(8)))\n"
" for (int i=0; i<8; i++) {\n"
" src[i]=line_cache[kw+STRIDE_X*i];\n"
" }\n"
"#ifdef POOL_AVG\n"
" result += src;\n"
"#else\n"
"#if RETURN_REDICE\n"
" redice=src>result ? (int8)((ih_start+kh)*input_shape.y+iw_start+kw) : redice;\n"
"#endif\n"
" result=fmax(result,src);\n"
"#endif\n"
" }\n"
" }\n"
"#ifdef POOL_AVG\n"
" result=result/total_count;\n"
"#endif\n"
" const uint lid_x=sglid % 4;\n"
" const uint lid_y=sglid/4;\n"
" \n"
" const int out_offset=(((b_idx+c_idx*4*batch)*output_shape.x+oh_idx)* output_shape.y+ow_idx+output_pad_left)*4;\n"
" const int batch_width_height=batch*output_shape.y*output_shape.x*4;\n"
"#if RETURN_REDICE\n"
" const int redice_offset=(((b_idx+c_idx*4*batch)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4;\n"
"#endif\n"
"#if OUTPUT_LEFTOVERS\n"
" if ((c_idx+1)*16 >= channel) {\n"
" for (int i=0; i<8; i++) {\n"
// Braces are required here: without them only the `output` store is guarded
// and the `rediceOutput` store would run for out-of-range lanes/columns.
" if ((c_idx*16+lid_y*4+lid_x<channel) && (ow_idx+i)<output_shape.y) {\n"
" output[out_offset+lid_y*batch_width_height+i*4+lid_x]=result[i];\n"
"#if RETURN_REDICE\n"
" rediceOutput[redice_offset+lid_y*batch_width_height+i*4+lid_x]=redice[i];\n"
"#endif\n"
" }\n"
" }\n"
" }\n"
" else\n"
"#endif \n"
" {\n"
" for (int i=0; i<8 && (ow_idx+i)<output_shape.y; i++) {\n"
" output[out_offset+lid_y*batch_width_height+i*4+lid_x]=result[i];\n"
"#if RETURN_REDICE\n"
" rediceOutput[redice_offset+lid_y*batch_width_height+i*4+lid_x]=redice[i];\n"
"#endif\n"
" }\n"
" }\n"
"}\n"
;
#endif
#endif
}
